This is a project that uses Body Measurement data along with age to predict gender. Using the randomForest algorithm in R, I have created a model with approximately 60% accuracy in the testing data. This model can be refined further by choosing better predictor variables.
The Original data can be found here: https://www.kaggle.com/datasets/saurabhshahane/body-measurements-dataset
The Data has the following properties: Attribute information:
Gender (Male and Female (M=1 & F= 2) (391 Males & 324 Females)
Age (1 year and above)
HeadCircumference (in inches)
ShoulderWidth (in inches)
ChestWidth (in inches)
Belly (in inches)
Waist (in inches)
Hips (in inches)
ArmLength (in inches)
ShoulderToWaist (in inches)
WaistToKnee (in inches)
LegLength (in inches)
TotalHeight - from head to toe (in inches)
Class Label (Not defined)
Dataset Characteristics: Multivariate, Numerical
Attribute Characteristics: Real
Associated Tasks: Classification, Regression
Number of Instances: 13
Number of Attributes: 716
Missing Values: No
Domain: cross domain
install.packages("tidyverse")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
install.packages("randomForest")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
install.packages("party")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Loading required package: sandwich
##
## Attaching package: 'strucchange'
##
## The following object is masked from 'package:stringr':
##
## boundary
body_measurements <- read.csv("Body Measurements _ original_CSV.csv")
summary(body_measurements)
## Gender Age HeadCircumference ShoulderWidth
## Min. :1.000 Min. : 1.00 Min. : 5.00 Min. : 4.00
## 1st Qu.:1.000 1st Qu.: 7.00 1st Qu.:19.00 1st Qu.:11.00
## Median :1.000 Median :11.00 Median :20.00 Median :14.00
## Mean :1.453 Mean :15.34 Mean :20.57 Mean :14.32
## 3rd Qu.:2.000 3rd Qu.:21.00 3rd Qu.:22.00 3rd Qu.:18.00
## Max. :2.000 Max. :68.00 Max. :80.00 Max. :87.00
## NA's :1
## ChestWidth Belly Waist Hips
## Min. : 6.00 Min. : 5.0 Min. : 2.00 Min. : 7.00
## 1st Qu.:11.00 1st Qu.: 15.0 1st Qu.:12.00 1st Qu.:12.00
## Median :13.00 Median : 20.0 Median :20.00 Median :18.00
## Mean :14.57 Mean : 20.2 Mean :19.27 Mean :19.38
## 3rd Qu.:17.00 3rd Qu.: 23.0 3rd Qu.:23.00 3rd Qu.:24.00
## Max. :38.00 Max. :213.0 Max. :91.00 Max. :63.00
##
## ArmLength ShoulderToWaist WaistToKnee LegLength
## Min. : 6.00 Min. : 1.0 Min. : 4.00 Min. : 9.00
## 1st Qu.:16.00 1st Qu.:13.0 1st Qu.:13.00 1st Qu.:21.00
## Median :19.00 Median :17.5 Median :16.00 Median :26.00
## Mean :18.82 Mean :17.9 Mean :16.56 Mean :26.83
## 3rd Qu.:22.00 3rd Qu.:22.0 3rd Qu.:20.00 3rd Qu.:32.00
## Max. :66.00 Max. :39.0 Max. :45.00 Max. :50.00
##
## TotalHeight
## Min. :19.00
## 1st Qu.:40.00
## Median :48.00
## Mean :48.12
## 3rd Qu.:55.00
## Max. :89.00
##
glimpse(body_measurements)
## Rows: 716
## Columns: 13
## $ Gender <int> 1, 1, 2, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 1, 2, 2, 2, 1…
## $ Age <int> 30, 28, 27, 29, 28, 22, 18, 26, 23, 31, 29, 35, 29, …
## $ HeadCircumference <int> 22, 19, 21, 20, 16, 17, 25, 18, 16, 15, 16, 23, 21, …
## $ ShoulderWidth <int> 18, 22, 18, 20, 14, 19, 17, 15, 16, 20, 28, 21, 19, …
## $ ChestWidth <int> 20, 17, 16, 18, 18, 18, 16, 19, 20, 28, 17, 18, 17, …
## $ Belly <int> 18, 18, 14, 11, 13, 14, 17, 17, 18, 18, 15, 12, 17, …
## $ Waist <int> 14, 21, 10, 19, 11, 16, 12, 23, 22, 91, 21, 18, 20, …
## $ Hips <int> 22, 25, 15, 14, 30, 18, 28, 27, 18, 17, 30, 27, 21, …
## $ ArmLength <int> 22, 28, 21, 24, 25, 20, 23, 19, 15, 16, 17, 18, 20, …
## $ ShoulderToWaist <int> 25, 23, 18, 21, 22, 24, 25, 19, 26, 21, 25, 17, 20, …
## $ WaistToKnee <int> 25, 25, 14, 20, 32, 21, 14, 19, 20, 21, 18, 19, 18, …
## $ LegLength <int> 22, 20, 18, 21, 13, 19, 18, 19, 19, 19, 17, 20, 22, …
## $ TotalHeight <int> 52, 56, 53, 45, 47, 60, 49, 58, 40, 55, 50, 49, 59, …
str(body_measurements)
## 'data.frame': 716 obs. of 13 variables:
## $ Gender : int 1 1 2 1 2 2 2 2 1 1 ...
## $ Age : int 30 28 27 29 28 22 18 26 23 31 ...
## $ HeadCircumference: int 22 19 21 20 16 17 25 18 16 15 ...
## $ ShoulderWidth : int 18 22 18 20 14 19 17 15 16 20 ...
## $ ChestWidth : int 20 17 16 18 18 18 16 19 20 28 ...
## $ Belly : int 18 18 14 11 13 14 17 17 18 18 ...
## $ Waist : int 14 21 10 19 11 16 12 23 22 91 ...
## $ Hips : int 22 25 15 14 30 18 28 27 18 17 ...
## $ ArmLength : int 22 28 21 24 25 20 23 19 15 16 ...
## $ ShoulderToWaist : int 25 23 18 21 22 24 25 19 26 21 ...
## $ WaistToKnee : int 25 25 14 20 32 21 14 19 20 21 ...
## $ LegLength : int 22 20 18 21 13 19 18 19 19 19 ...
## $ TotalHeight : int 52 56 53 45 47 60 49 58 40 55 ...
colnames(body_measurements)
## [1] "Gender" "Age" "HeadCircumference"
## [4] "ShoulderWidth" "ChestWidth" "Belly"
## [7] "Waist" "Hips" "ArmLength"
## [10] "ShoulderToWaist" "WaistToKnee" "LegLength"
## [13] "TotalHeight"
body_measurements_1 <- mutate(body_measurements, Gender_1 = as.factor(Gender))
body_measurements_2 <- select(body_measurements_1,-1)
summary(body_measurements_2)
## Age HeadCircumference ShoulderWidth ChestWidth
## Min. : 1.00 Min. : 5.00 Min. : 4.00 Min. : 6.00
## 1st Qu.: 7.00 1st Qu.:19.00 1st Qu.:11.00 1st Qu.:11.00
## Median :11.00 Median :20.00 Median :14.00 Median :13.00
## Mean :15.34 Mean :20.57 Mean :14.32 Mean :14.57
## 3rd Qu.:21.00 3rd Qu.:22.00 3rd Qu.:18.00 3rd Qu.:17.00
## Max. :68.00 Max. :80.00 Max. :87.00 Max. :38.00
## Belly Waist Hips ArmLength
## Min. : 5.0 Min. : 2.00 Min. : 7.00 Min. : 6.00
## 1st Qu.: 15.0 1st Qu.:12.00 1st Qu.:12.00 1st Qu.:16.00
## Median : 20.0 Median :20.00 Median :18.00 Median :19.00
## Mean : 20.2 Mean :19.27 Mean :19.38 Mean :18.82
## 3rd Qu.: 23.0 3rd Qu.:23.00 3rd Qu.:24.00 3rd Qu.:22.00
## Max. :213.0 Max. :91.00 Max. :63.00 Max. :66.00
## ShoulderToWaist WaistToKnee LegLength TotalHeight Gender_1
## Min. : 1.0 Min. : 4.00 Min. : 9.00 Min. :19.00 1 :391
## 1st Qu.:13.0 1st Qu.:13.00 1st Qu.:21.00 1st Qu.:40.00 2 :324
## Median :17.5 Median :16.00 Median :26.00 Median :48.00 NA's: 1
## Mean :17.9 Mean :16.56 Mean :26.83 Mean :48.12
## 3rd Qu.:22.0 3rd Qu.:20.00 3rd Qu.:32.00 3rd Qu.:55.00
## Max. :39.0 Max. :45.00 Max. :50.00 Max. :89.00
body_measurements_3 <- body_measurements_2 %>% na.omit()
summary(body_measurements_3)
## Age HeadCircumference ShoulderWidth ChestWidth
## Min. : 1.00 Min. : 5.00 Min. : 4.00 Min. : 6.00
## 1st Qu.: 7.00 1st Qu.:19.00 1st Qu.:11.00 1st Qu.:11.00
## Median :11.00 Median :20.00 Median :14.00 Median :13.00
## Mean :15.35 Mean :20.57 Mean :14.32 Mean :14.57
## 3rd Qu.:21.00 3rd Qu.:22.00 3rd Qu.:18.00 3rd Qu.:17.00
## Max. :68.00 Max. :80.00 Max. :87.00 Max. :38.00
## Belly Waist Hips ArmLength
## Min. : 5.0 Min. : 2.00 Min. : 7.00 Min. : 6.00
## 1st Qu.: 15.0 1st Qu.:12.00 1st Qu.:12.00 1st Qu.:16.00
## Median : 20.0 Median :20.00 Median :18.00 Median :19.00
## Mean : 20.2 Mean :19.26 Mean :19.39 Mean :18.81
## 3rd Qu.: 23.0 3rd Qu.:23.00 3rd Qu.:24.00 3rd Qu.:22.00
## Max. :213.0 Max. :91.00 Max. :63.00 Max. :66.00
## ShoulderToWaist WaistToKnee LegLength TotalHeight Gender_1
## Min. : 1.00 Min. : 4.00 Min. : 9.00 Min. :19.00 1:391
## 1st Qu.:13.00 1st Qu.:13.00 1st Qu.:21.00 1st Qu.:40.00 2:324
## Median :18.00 Median :16.00 Median :26.00 Median :48.00
## Mean :17.91 Mean :16.56 Mean :26.84 Mean :48.12
## 3rd Qu.:22.00 3rd Qu.:20.00 3rd Qu.:32.00 3rd Qu.:55.00
## Max. :39.00 Max. :45.00 Max. :50.00 Max. :89.00
set.seed(49879)
index <- sample(2,nrow(body_measurements_3),replace = TRUE, prob=c(0.7,0.3))
Training <- body_measurements_3[index==1,]
Testing <- body_measurements_3[index==2,]
RFM <- randomForest(Gender_1~.,data=Training)
Prediction <- predict(RFM,Testing)
Testing$Gender_pred = Prediction
CFM <- table(Testing$Gender_1,Testing$Gender_pred)
CFM
##
## 1 2
## 1 78 39
## 2 54 54
trees <- ctree(Gender_1 ~ ., data=body_measurements_3)
plot(trees, type="simple")
`